Statistical Techniques Project

Install Packages

# Install every package required by this analysis.
# Fixes: "scales" appeared twice in the original list; packages are now
# listed once and installed only if they are not already present, so
# re-running the script does not reinstall everything.
pkgs <- c(
  "Metrics", "tidymodels", "ggplot2", "dplyr", "naniar", "tidyverse",
  "kableExtra", "lubridate", "readxl", "highcharter", "scales",
  "RColorBrewer", "wesanderson", "plotly", "shiny", "readr",
  "choroplethr", "choroplethrMaps", "GGally", "zoo", "ggmap",
  "stringr", "gridExtra", "caret", "treemap", "psych", "DAAG",
  "leaps", "corrplot", "glmnet", "boot", "ggpubr", "gplots", "nortest"
)
install.packages(setdiff(pkgs, rownames(installed.packages())))

Loading Packages

Importing the dataset into variable “hr” using ‘read.csv’

# Import the dataset into variable "hr".
# read.csv() already returns a data.frame, so the original second step
# hr <- data.frame(hr) was a no-op and has been removed.
hr <- read.csv("HR_comma_sep.csv")

Displaying first 6 observations of the Dataset using ‘head’

# Preview the first six observations of the dataset.
head(hr)
##   satisfaction_level last_evaluation number_project average_montly_hours
## 1               0.38            0.53              2                  157
## 2               0.80            0.86              5                  262
## 3               0.11            0.88              7                  272
## 4               0.72            0.87              5                  223
## 5               0.37            0.52              2                  159
## 6               0.41            0.50              2                  153
##   time_spend_company Work_accident left promotion_last_5years Department salary
## 1                  3             0    1                     0      sales    low
## 2                  6             0    1                     0      sales medium
## 3                  4             0    1                     0      sales medium
## 4                  5             0    1                     0      sales    low
## 5                  3             0    1                     0      sales    low
## 6                  3             0    1                     0      sales    low

Checking Dimensionality

# Dataset dimensionality: 14,999 rows by 10 columns.
dim(hr)
## [1] 14999    10

Column Names

# List all column names of the dataset.
colnames(hr)
##  [1] "satisfaction_level"    "last_evaluation"       "number_project"       
##  [4] "average_montly_hours"  "time_spend_company"    "Work_accident"        
##  [7] "left"                  "promotion_last_5years" "Department"           
## [10] "salary"

Changing Character Data Types to Categorical Variables

As we can see from the column names, some dimensions contain categorical data. When the data is first loaded, these are read in as character type. In order to work with such variables, their data types need to be converted to Factor.

# Convert every character column (Department, salary) to a factor in one
# pass: unclass() turns the data.frame into a plain list, and rebuilding it
# with stringsAsFactors = TRUE coerces the character columns to factors.
hr <- as.data.frame(unclass(hr), stringsAsFactors = TRUE)
# Confirm the resulting column types.
str(hr)
## 'data.frame':    14999 obs. of  10 variables:
##  $ satisfaction_level   : num  0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
##  $ last_evaluation      : num  0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ left                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ promotion_last_5years: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Department           : Factor w/ 10 levels "accounting","hr",..: 8 8 8 8 8 8 8 8 8 8 ...
##  $ salary               : Factor w/ 3 levels "high","low","medium": 2 3 3 2 2 2 2 2 2 2 ...

Checking the Null Values in the Dataset

Now we check the number of null values and variables that consist of null values in the dataset.

# Total number of missing values across the whole dataset (0 here).
sum(is.na(hr))
## [1] 0

Summary of the dataset

# Five-number summaries for numeric columns; level counts for factors.
summary(hr)
##  satisfaction_level last_evaluation  number_project  average_montly_hours
##  Min.   :0.0900     Min.   :0.3600   Min.   :2.000   Min.   : 96.0       
##  1st Qu.:0.4400     1st Qu.:0.5600   1st Qu.:3.000   1st Qu.:156.0       
##  Median :0.6400     Median :0.7200   Median :4.000   Median :200.0       
##  Mean   :0.6128     Mean   :0.7161   Mean   :3.803   Mean   :201.1       
##  3rd Qu.:0.8200     3rd Qu.:0.8700   3rd Qu.:5.000   3rd Qu.:245.0       
##  Max.   :1.0000     Max.   :1.0000   Max.   :7.000   Max.   :310.0       
##                                                                          
##  time_spend_company Work_accident         left        promotion_last_5years
##  Min.   : 2.000     Min.   :0.0000   Min.   :0.0000   Min.   :0.00000      
##  1st Qu.: 3.000     1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000      
##  Median : 3.000     Median :0.0000   Median :0.0000   Median :0.00000      
##  Mean   : 3.498     Mean   :0.1446   Mean   :0.2381   Mean   :0.02127      
##  3rd Qu.: 4.000     3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000      
##  Max.   :10.000     Max.   :1.0000   Max.   :1.0000   Max.   :1.00000      
##                                                                            
##        Department      salary    
##  sales      :4140   high  :1237  
##  technical  :2720   low   :7316  
##  support    :2229   medium:6446  
##  IT         :1227                
##  product_mng: 902                
##  marketing  : 858                
##  (Other)    :2923
# Compact transposed preview of each column with its type (dplyr).
glimpse(hr)
## Rows: 14,999
## Columns: 10
## $ satisfaction_level    <dbl> 0.38, 0.80, 0.11, 0.72, 0.37, 0.41, 0.10, 0.92, ~
## $ last_evaluation       <dbl> 0.53, 0.86, 0.88, 0.87, 0.52, 0.50, 0.77, 0.85, ~
## $ number_project        <int> 2, 5, 7, 5, 2, 2, 6, 5, 5, 2, 2, 6, 4, 2, 2, 2, ~
## $ average_montly_hours  <int> 157, 262, 272, 223, 159, 153, 247, 259, 224, 142~
## $ time_spend_company    <int> 3, 6, 4, 5, 3, 3, 4, 5, 5, 3, 3, 4, 5, 3, 3, 3, ~
## $ Work_accident         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ left                  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ promotion_last_5years <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ Department            <fct> sales, sales, sales, sales, sales, sales, sales,~
## $ salary                <fct> low, medium, medium, low, low, low, low, low, lo~

Missing Values using Graphical Representation

Here we can graphically visualize the null values in each attribute

# naniar: bar chart of missing-value counts per variable.
gg_miss_var(hr)

Heat Plot of Missing Values

Heat plot that clearly mention the features containing null values and overall percentage of missing and present values.

# naniar: heat map of missing vs present cells; x labels rotated to fit.
vis_miss(hr) + theme(axis.text.x = element_text(angle = 90))

Exploratory Data Analysis

Analysis of Departments

This pie chart is used to find the types of departments in a company along with their percentages.

# Pie chart of employee counts per department (plotly).
# as.data.frame(table(...)) directly yields the Var1/Freq columns used
# for the labels and slice sizes.
hr_dept <- as.data.frame(table(hr$Department))
fig <- plot_ly(
  hr_dept,
  labels = ~Var1,
  values = ~Freq,
  type = "pie"
)
fig
27.6%18.1%14.9%8.18%6.01%5.72%5.25%5.11%4.93%4.2%
salestechnicalsupportITproduct_mngmarketingRandDaccountinghrmanagement

Salary Analysis of Each Department

# Share of each salary band within every department.
# Simplification: the original built two summary tables (per-department/
# salary counts and per-department totals) and merged them; a grouped
# mutate computes the same ratio in one pipeline. The filter on
# salary %in% c("high","medium","low") was a no-op, since every row has
# one of those three levels.
salary_ratio <- hr %>%
  group_by(Department, salary) %>%
  summarize(Freq = n(), .groups = "drop_last") %>%
  mutate(ratio = Freq / sum(Freq)) %>%
  ungroup()

# Plot: dodged bars per salary band, y axis formatted as percentages.
ggplot(salary_ratio, aes(x = Department, y = ratio, fill = salary)) +
  geom_bar(position = "dodge", stat = "identity") +
  xlab("Department") + ylab("Salary") +
  scale_fill_discrete(name = "Salary Categories") +
  scale_y_continuous(labels = scales::percent) +
  coord_flip()

Satisfaction level comparison among each department.

# Mean satisfaction level per department, as an ordered horizontal bar chart.
# Fixes: geom_col() already implies stat = "identity", so the explicit stat
# argument (which triggers "Ignoring unknown parameters: stat") is removed;
# the aes(fill = Department) mapping was overridden by the fixed fill colour
# and is dropped; round()'s argument is spelled `digits` (the original
# relied on partial matching of `digit`); duplicated xlab/ylab calls and
# no-op axis.title theme entries are removed.
hr %>%
  group_by(Department) %>%
  summarise(mean_satisfaction_level = mean(satisfaction_level, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(Department, mean_satisfaction_level),
             y = mean_satisfaction_level)) +
  geom_col(color = "black", fill = "#ba3a9e") +
  coord_flip() +
  theme_gray() +
  labs(x = "Department", y = "Satisfaction Level") +
  geom_text(aes(label = round(mean_satisfaction_level, digits = 4)),
            hjust = 2.0, color = "white", size = 3.5) +
  ggtitle("Satisfaction Level for each Department",
          subtitle = "Satisfaction Level vs Department") +
  theme(legend.position = "none",
        plot.title = element_text(color = "black", size = 14,
                                  face = "bold", hjust = 1),
        plot.subtitle = element_text(color = "black", hjust = 0.5),
        axis.ticks = element_blank())

Satisfaction Level for each salary level

# Mean satisfaction level per salary band, as an ordered horizontal bar chart.
# Fixes: geom_col() already implies stat = "identity" — the explicit stat
# argument caused the "Ignoring unknown parameters: stat" warning seen in
# the output and is removed; round()'s argument is spelled `digits`;
# duplicated xlab/ylab calls and no-op axis.title theme entries removed.
# NOTE(review): the salary factor has levels high/low/medium only, so the
# "Unknown" filter kept below is a no-op on this dataset.
hr %>%
  filter(!(is.na(salary))) %>%
  filter(!(salary == "Unknown")) %>%
  group_by(salary) %>%
  summarise(mean_satisfaction_level = mean(satisfaction_level, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(salary, mean_satisfaction_level),
             y = mean_satisfaction_level)) +
  geom_col(color = "black", fill = "#b9d2fa") +
  coord_flip() +
  theme_gray() +
  labs(x = "Salary", y = "Satisfaction Level") +
  geom_text(aes(label = round(mean_satisfaction_level, digits = 4)),
            hjust = 2.0, color = "black", size = 3.5) +
  ggtitle("Mean Satisfaction Level comparison with Salary",
          subtitle = "Satisfaction Level vs Salary") +
  theme(legend.position = "none",
        plot.title = element_text(color = "black", size = 14,
                                  face = "bold", hjust = 0.5),
        plot.subtitle = element_text(color = "black", hjust = 0.5),
        axis.ticks = element_blank())
## Warning: Ignoring unknown parameters: stat

## Correlation Matrix of numeric dimensions. A correlation plot is made to find relationships among features.

# Keep only the eight numeric columns for the correlation analysis.
numeric_cols <- c("satisfaction_level", "last_evaluation", "number_project",
                  "average_montly_hours", "time_spend_company",
                  "Work_accident", "left", "promotion_last_5years")
hr.corr <- hr[, numeric_cols]

# Pearson correlation matrix of the numeric features.
cor(hr.corr)
##                       satisfaction_level last_evaluation number_project
## satisfaction_level            1.00000000     0.105021214   -0.142969586
## last_evaluation               0.10502121     1.000000000    0.349332589
## number_project               -0.14296959     0.349332589    1.000000000
## average_montly_hours         -0.02004811     0.339741800    0.417210634
## time_spend_company           -0.10086607     0.131590722    0.196785891
## Work_accident                 0.05869724    -0.007104289   -0.004740548
## left                         -0.38837498     0.006567120    0.023787185
## promotion_last_5years         0.02560519    -0.008683768   -0.006063958
##                       average_montly_hours time_spend_company Work_accident
## satisfaction_level            -0.020048113       -0.100866073   0.058697241
## last_evaluation                0.339741800        0.131590722  -0.007104289
## number_project                 0.417210634        0.196785891  -0.004740548
## average_montly_hours           1.000000000        0.127754910  -0.010142888
## time_spend_company             0.127754910        1.000000000   0.002120418
## Work_accident                 -0.010142888        0.002120418   1.000000000
## left                           0.071287179        0.144822175  -0.154621634
## promotion_last_5years         -0.003544414        0.067432925   0.039245435
##                              left promotion_last_5years
## satisfaction_level    -0.38837498           0.025605186
## last_evaluation        0.00656712          -0.008683768
## number_project         0.02378719          -0.006063958
## average_montly_hours   0.07128718          -0.003544414
## time_spend_company     0.14482217           0.067432925
## Work_accident         -0.15462163           0.039245435
## left                   1.00000000          -0.061788107
## promotion_last_5years -0.06178811           1.000000000
# Visualise the correlation matrix; circle size/colour encode strength.
corrplot(cor(hr.corr), method= "circle", bg = "black") # put this in a nice table

Statistical Tests

1. Hypothesis Test

Before performing any technique we need to perform some preliminary tests.

Checking NORMALITY of the dataset

Null Hypothesis: Data is normally distributed Alternative Hypothesis: Data is not normally distributed

Graphical Analysis for Normality Check

Histogram

# 2x2 grid of histograms for the four main numeric variables, used as a
# first visual check of normality.
# Bug fix: the original reused xlab = 'satisfaction_level' on all four
# plots; each histogram now labels its own variable. The redundant
# paste() around single-string titles is also dropped.
par(mfrow = c(2, 2))
hist(hr$satisfaction_level, freq = TRUE, main = "Histogram of Satisfaction Level", xlab = 'satisfaction_level', col = '#32a88d')
hist(hr$last_evaluation, freq = TRUE, main = "Histogram of Last Evaluation", xlab = 'last_evaluation', col = '#3298a8')
hist(hr$average_montly_hours, freq = TRUE, main = "Histogram of Average Monthly Hours", xlab = 'average_montly_hours', col = '#326da8')
hist(hr$number_project, freq = TRUE, main = "Histogram of Number of Projects", xlab = 'number_project', col = '#5732a8')

## Boxplot

# 2x2 grid of boxplots to inspect spread and outliers of the same four
# variables.
# Bug fix: "satifaction_level" label typo corrected.
par(mfrow = c(2, 2))
boxplot(hr$satisfaction_level, xlab = 'satisfaction_level', col = '#32a88d')
boxplot(hr$last_evaluation, xlab = 'last_evaluation', col = '#3298a8')
boxplot(hr$average_montly_hours, xlab = 'average_montly_hours', col = '#326da8')
boxplot(hr$number_project, xlab = 'number_project', col = '#5732a8')

## QQ-Plot

# Normal Q-Q plot of satisfaction_level with a reference line.
qqnorm(hr$satisfaction_level)
qqline(hr$satisfaction_level, col = 'blue' )

## GGQQPLOT

# ggpubr Q-Q plot (with confidence band) for the same variable.
ggqqplot(hr$satisfaction_level)

From the above plots one can conclude that the distribution of satisfaction level is skewed, which means that it is not normal. Hence, we will now implement a non-parametric test, i.e. the Mann-Whitney U-test, instead of the parametric t-test.

Statistical Test to check Normality

# Quick distribution summary before the formal normality tests.
summary(hr$satisfaction_level)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0900  0.4400  0.6400  0.6128  0.8200  1.0000
# Anderson-Darling normality test (no sample-size limit, unlike Shapiro-Wilk).
nortest::ad.test(hr$satisfaction_level)
## 
##  Anderson-Darling normality test
## 
## data:  hr$satisfaction_level
## A = 168.37, p-value < 2.2e-16
# Shapiro-Wilk on the first 5000 observations (the test accepts at most
# 5000 values). Idiom fix: use 1:5000 instead of 0:5000 — a zero index is
# silently dropped in R, so the original happened to work but was
# misleading.
shapiro.test(hr$satisfaction_level[1:5000])
## 
##  Shapiro-Wilk normality test
## 
## data:  hr$satisfaction_level[0:5000]
## W = 0.94543, p-value < 2.2e-16
# Shapiro-Wilk on the second batch of 5000 observations.
shapiro.test(hr$satisfaction_level[5001:10000])
## 
##  Shapiro-Wilk normality test
## 
## data:  hr$satisfaction_level[5001:10000]
## W = 0.9563, p-value < 2.2e-16
# Shapiro-Wilk on the remaining observations (rows 10001-14999).
shapiro.test(hr$satisfaction_level[10001:14999])
## 
##  Shapiro-Wilk normality test
## 
## data:  hr$satisfaction_level[10001:14999]
## W = 0.95056, p-value < 2.2e-16

As the p-value is much smaller than threshold value i.e. 0.05. So we can reject null hypothesis and go with the alternative hypothesis. Alternative Hypothesis: Data is not normally distributed

Here we applied the Shapiro-Wilk test three times because this test has the limitation that it can only be run on at most 5000 observations, so we split the dataset into three batches accordingly. Moreover, the Anderson-Darling test is also applied to cross-check the normality of the data.

Two Independent Samples, Mann-Whitney U-Test

In this test, selected dimensions are satisfaction_level as metric variable and left as nominal variable.

Null Hypothesis: Left has no influence on satisfaction_level Alternative Hypothesis: Left has influence on satisfaction_level

# Mann-Whitney U-test (Wilcoxon rank-sum): does satisfaction_level differ
# between employees who left (left == 1) and those who stayed?
wilcox.test(hr$satisfaction_level~hr$left)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  hr$satisfaction_level by hr$left
## W = 30522915, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

A Mann-Whitney U-test showed that left has a significant influence (W = 30522915, p-value &lt; 2.2e-16) on satisfaction level.

Comparison Box Plot

# Interactive boxplot comparing satisfaction level by `left` status.
# Plot construction and title are chained with a single pipe.
fig <- plot_ly(y = hr$satisfaction_level, x = hr$left, type = "box") %>%
  layout(title = "Satisfaction_Level vs Left")
fig

Chi-Squared Test

1st Chi-Sq Test

In this test, we use categorical variables in order to find some relationship among the variables. Here, department and promotion_last_5years are used for testing

# Contingency table of Department vs promotion_last_5years, visualised as
# a balloon plot (dot size proportional to cell frequency).
chisq_table <- table(hr$Department, hr$promotion_last_5years)
balloonplot(t(chisq_table), main = "Data Analysis for Chi Squared Test", label=FALSE, show.margins = FALSE, xlab = "Promotion Last 5 Years", ylab = "Department")

Implementing Chi-sq test using R formula

# Pearson's chi-squared test of independence on the contingency table.
chisq <- chisq.test(chisq_table)
chisq
## 
##  Pearson's Chi-squared test
## 
## data:  chisq_table
## X-squared = 350.91, df = 9, p-value < 2.2e-16

Observed Frequencies

# Observed cell frequencies stored in the chi-squared test object.
chisq$observed
##              
##                  0    1
##   accounting   753   14
##   hr           724   15
##   IT          1224    3
##   management   561   69
##   marketing    815   43
##   product_mng  902    0
##   RandD        760   27
##   sales       4040  100
##   support     2209   20
##   technical   2692   28

Expected Frequencies

# Expected frequencies under independence, rounded to whole counts.
round(chisq$expected, 0)
##              
##                  0  1
##   accounting   751 16
##   hr           723 16
##   IT          1201 26
##   management   617 13
##   marketing    840 18
##   product_mng  883 19
##   RandD        770 17
##   sales       4052 88
##   support     2182 47
##   technical   2662 58

Residuals

# Pearson residuals: which cells drive the association.
round(chisq$residuals, 3)
##              
##                    0      1
##   accounting   0.084 -0.573
##   hr           0.027 -0.181
##   IT           0.666 -4.521
##   management  -2.239 15.190
##   marketing   -0.854  5.794
##   product_mng  0.646 -4.380
##   RandD       -0.370  2.508
##   sales       -0.188  1.274
##   support      0.587 -3.980
##   technical    0.579 -3.924

Correlation Plot of Residuals

# Plot the residual matrix (is.cor = FALSE: values are not correlations).
corrplot(chisq$residuals, is.cor = FALSE)

2nd Chi-Sq Test

Now here department and salary are used for testing

# Second test: contingency table of Department vs salary, shown as a
# balloon plot.
chisq_table1 <- table(hr$Department, hr$salary)
balloonplot(t(chisq_table1), main = "Data Analysis for Chi Squared Test", label=FALSE, show.margins = FALSE, xlab = "Salary", ylab = "Department")

Chi-sq test using R formula

# Pearson's chi-squared test of independence for Department vs salary.
chisq_1 <- chisq.test(chisq_table1)
chisq_1
## 
##  Pearson's Chi-squared test
## 
## data:  chisq_table1
## X-squared = 700.92, df = 18, p-value < 2.2e-16

Observed Values

# Observed cell frequencies for the Department vs salary table.
chisq_1$observed
##              
##               high  low medium
##   accounting    74  358    335
##   hr            45  335    359
##   IT            83  609    535
##   management   225  180    225
##   marketing     80  402    376
##   product_mng   68  451    383
##   RandD         51  364    372
##   sales        269 2099   1772
##   support      141 1146    942
##   technical    201 1372   1147

Expected Values

# Expected frequencies under independence, rounded to 2 decimal places.
round(chisq_1$expected, 2)
##              
##                 high     low  medium
##   accounting   63.26  374.12  329.63
##   hr           60.95  360.46  317.59
##   IT          101.19  598.49  527.32
##   management   51.96  307.29  270.75
##   marketing    70.76  418.50  368.74
##   product_mng  74.39  439.96  387.65
##   RandD        64.91  383.87  338.22
##   sales       341.43 2019.35 1779.21
##   support     183.83 1087.23  957.94
##   technical   224.32 1326.72 1168.95

Residuals

# Pearson residuals for the Department vs salary test.
round(chisq_1$residuals, 3)
##              
##                 high    low medium
##   accounting   1.351 -0.833  0.296
##   hr          -2.043 -1.341  2.323
##   IT          -1.809  0.430  0.335
##   management  24.007 -7.262 -2.780
##   marketing    1.098 -0.807  0.378
##   product_mng -0.741  0.526 -0.236
##   RandD       -1.726 -1.014  1.837
##   sales       -3.920  1.772 -0.171
##   support     -3.159  1.782 -0.515
##   technical   -1.557  1.243 -0.642

Correlation Plot of Residuals

# Plot the residual matrix (is.cor = FALSE: values are not correlations).
corrplot(chisq_1$residuals, is.cor = FALSE)

The next test would be a one-factor ANOVA, but as the data is not normally distributed we use the Kruskal-Wallis test instead. ## Kruskal Wallis Test In order to implement the Kruskal-Wallis test, Department and satisfaction_level are the selected dimensions.

Null Hypothesis: No significant differences between the satisfaction level of employees among various department Alternative Hypothesis: There is a significant differences between the satisfaction level of employees among various department

# Prepare the data for the Kruskal-Wallis test: department (grouping
# factor) and satisfaction level (response).
Dept <- hr$Department
SL <- hr$satisfaction_level

KWT_data <- data.frame(Dept, SL)
# Show the factor levels, then impose an explicit ordering for the plots.
levels(KWT_data$Dept)
KWT_data$Dept <- ordered(KWT_data$Dept,
                         levels = c("accounting", "hr", "IT", "management",
                                    "marketing", "product_mng", "RandD",
                                    "sales", "support", "technical"))
# Per-department summary statistics.
# Bug fix: the original called mean(KWT_data$SL), sd(KWT_data$SL), etc.
# inside summarise(). Using the full `KWT_data$SL` vector bypasses the
# grouping, so every department showed the identical overall statistics
# (mean 0.613, sd 0.249, ...). Referencing the bare column names makes
# the statistics group-specific.
group_by(KWT_data, Dept) %>%
  summarise(count = n(),
            mean = mean(SL),
            sd = sd(SL),
            median = median(SL),
            IQR = IQR(SL))
## # A tibble: 10 x 6
##    Dept        count  mean    sd median   IQR
##    <ord>       <int> <dbl> <dbl>  <dbl> <dbl>
##  1 accounting    767 0.613 0.249   0.64  0.38
##  2 hr            739 0.613 0.249   0.64  0.38
##  3 IT           1227 0.613 0.249   0.64  0.38
##  4 management    630 0.613 0.249   0.64  0.38
##  5 marketing     858 0.613 0.249   0.64  0.38
##  6 product_mng   902 0.613 0.249   0.64  0.38
##  7 RandD         787 0.613 0.249   0.64  0.38
##  8 sales        4140 0.613 0.249   0.64  0.38
##  9 support      2229 0.613 0.249   0.64  0.38
## 10 technical    2720 0.613 0.249   0.64  0.38
# Interactive boxplot of satisfaction level per department.
# Bug fix: the original passed ggplot2::diamonds as the data argument
# while mapping columns from KWT_data with full $-references; plot_ly now
# receives the correct data frame and formula-based column references.
fig <- plot_ly(KWT_data, y = ~SL, x = ~Dept, type = "box", color = ~Dept)
fig
accountinghrITmanagementmarketingproduct_mngRandDsalessupporttechnical0.20.40.60.81
technicalsupportsalesRandDproduct_mngmarketingmanagementIThraccountingKWT_data$DeptKWT_data$SL
# Mean satisfaction per department with standard-error bars and jittered
# raw points; departments shown in a fixed order, x labels angled to fit.
ggline(KWT_data, x = "Dept", y = "SL", 
       add = c("mean_se", "jitter"), 
       order = c("accounting","hr","IT","management","marketing","product_mng","RandD","sales","support","technical"),
       ylab = "Satisfaction Level", xlab = "Departments") + theme(axis.text.x = element_text(angle = 45, hjust = 0.4, vjust = 0.5))

# Kruskal-Wallis rank-sum test: satisfaction level across departments.
kruskal.test(KWT_data$SL ~ KWT_data$Dept)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  KWT_data$SL by KWT_data$Dept
## Kruskal-Wallis chi-squared = 18.296, df = 9, p-value = 0.03189

The result shows that the p-value is less than the significance level, so we can conclude that there are significant differences in satisfaction level among employees of different departments. However, since the calculated p-value is not far below the significance level, the differences, while statistically significant, appear to be small rather than massive.

Correlation

For correlation, the variables taken under consideration are “time_spend_company”, “work_accident” and “number_project”. Furthermore, we have to decide whether to apply a non-parametric or a parametric test. For this, the normality of the considered variables will be checked.

# Side-by-side histograms to check normality of the two variables used in
# the correlation analysis.
# Bug fix: "Porjects" typo corrected in the first title; redundant paste()
# around single-string titles dropped.
par(mfrow = c(1, 2))
hist(hr$number_project, freq = TRUE, main = "Histogram of Number of Projects", xlab = 'number_project', col = '#53b56d')
hist(hr$time_spend_company, freq = TRUE, main = "Histogram of Time_Spend_Company", xlab = 'Time_Spend_Company', col = '#53b56d')

The above graphs depict that the data is not normally distributed; in fact, it is skewed. For more clarification, we visualize QQ plots and perform a normality test as well.

QQ-Plot for “number_project”

# Normal Q-Q plot for number_project.
qqnorm(hr$number_project)
qqline(hr$number_project, col = 'blue' )

QQ-Plot for “time_spend_company”

# Normal Q-Q plot for time_spend_company.
qqnorm(hr$time_spend_company)
qqline(hr$time_spend_company, col = 'blue' )

# Distribution summary for number_project.
summary(hr$number_project)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   3.000   4.000   3.803   5.000   7.000
# Anderson-Darling normality test for number_project.
nortest::ad.test(hr$number_project)
## 
##  Anderson-Darling normality test
## 
## data:  hr$number_project
## A = 444.73, p-value < 2.2e-16
# Distribution summary for time_spend_company.
summary(hr$time_spend_company)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   3.000   3.000   3.498   4.000  10.000
# Anderson-Darling normality test for time_spend_company.
nortest::ad.test(hr$time_spend_company)
## 
##  Anderson-Darling normality test
## 
## data:  hr$time_spend_company
## A = 930.99, p-value < 2.2e-16
# Scatter plot with regression line and annotated correlation coefficient.
# Consistency fix: the narrative (and the cor.test call below) use
# Spearman's rank correlation because the variables are not normally
# distributed, but the original plot annotated a Pearson coefficient.
# The plot annotation now matches the test actually reported.
ggscatter(hr, x = "number_project", y = "time_spend_company",
          add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "spearman",
          xlab = "Number of Projects", ylab = "Time Spend in Company")
## `geom_smooth()` using formula 'y ~ x'

# Spearman rank correlation (non-parametric); exact = FALSE because ties
# in these integer-valued variables prevent an exact p-value.
cor.test(hr$number_project, hr$time_spend_company,  method = "spearman", exact = FALSE)
## 
##  Spearman's rank correlation rho
## 
## data:  hr$number_project and hr$time_spend_company
## S = 4.2068e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##      rho 
## 0.251971

Linear Regression

# ---- Linear regression: train/test split, fit, and predictions ----
HR <- hr

# Response column and the five predictor columns used by the model.
y <- HR %>% select(satisfaction_level)
x <- HR %>% select(number_project, last_evaluation, time_spend_company, salary, Work_accident)

# Reproducible 75/25 random split via rsample's initial_split().
set.seed(0)
HR_split <- initial_split(HR, prop = 3/4)
HR_training <- HR_split %>% 
  training()
HR_test <- HR_split %>% 
  testing()

# Recover the original row indices from the partition row names so each
# row of HR can be tagged with the split it fell into (used to colour the
# plots below).
train_index <- as.integer(rownames(HR_training))
test_index <- as.integer(rownames(HR_test))

HR[train_index,'split'] = 'train'
HR[test_index,'split'] = 'test'

# Fit an ordinary least-squares model (parsnip front-end to lm) on the
# training partition only.
lm_model <- linear_reg() %>% 
  set_engine('lm') %>% 
  set_mode('regression') %>%
  fit(satisfaction_level ~ number_project + time_spend_company + salary + Work_accident  + last_evaluation, data = HR_training) 

# Predict for every row (train and test alike) and append the predictions
# to HR. NOTE(review): the scatter plot below therefore mixes in-sample
# and out-of-sample points, distinguished only by the `split` column.
prediction <- lm_model %>%
  predict(x) 
colnames(prediction) <- c('prediction')
HR = cbind(HR, prediction)

# Marginal histogram of observed satisfaction_level, overlaying the train
# (red) and test (blue) partitions; y-axis decorations suppressed since it
# sits above the scatter plot.
hist_top <- ggplot(HR,aes(x=satisfaction_level)) + 
  geom_histogram(data=subset(HR,split == 'train'),fill = "red", alpha = 0.2, bins = 6) +
  geom_histogram(data=subset(HR,split == 'test'),fill = "blue", alpha = 0.2, bins = 6) +
  theme(axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank())
hist_top <- ggplotly(p = hist_top)

# Observed vs predicted scatter, coloured by partition, with per-split
# linear trend lines.
scatter <- ggplot(HR, aes(x = satisfaction_level, y = prediction, color =  split)) +
 geom_point() +
 geom_smooth(formula=y ~ x, method=lm, se=FALSE)
scatter <- ggplotly(p = scatter, type = 'scatter')

# Marginal histogram of the predictions, rotated with coord_flip() to sit
# to the right of the scatter plot.
hist_right <- ggplot(HR,aes(x=prediction)) +
 geom_histogram(data=subset(HR,split == 'train'),fill = "red", alpha = 0.2, bins = 13) +
 geom_histogram(data=subset(HR,split == 'test'),fill = "blue", alpha = 0.2, bins = 13) +
 theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())+
 coord_flip()
hist_right <- ggplotly(p = hist_right)

# Assemble the marginal histograms around the central scatter plot in a
# 2x2 grid (empty panel in the top-right corner).
s <- subplot(
 hist_top,
 plotly_empty(),
 scatter,
 hist_right,
 nrows = 2, heights = c(0.2, 0.8), widths = c(0.8, 0.2), margin = 0,
 shareX = TRUE, shareY = TRUE, titleX = TRUE, titleY = TRUE
)
## Warning: No trace type specified and no positional attributes specified
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: Can only have one: config
# Render the combined figure without a legend.
layout(s, showlegend = FALSE)
0.250.500.751.000.40.50.60.70.8
satisfaction_levelprediction

Training Model with Training dataset

# Refit the same specification with base lm() for the full summary table.
# NOTE(review): this rebinds lm_model, replacing the parsnip fit above.
lm_model <- lm(satisfaction_level ~ number_project + time_spend_company + salary + Work_accident  + last_evaluation, data = HR_training )
summary(lm_model)
## 
## Call:
## lm(formula = satisfaction_level ~ number_project + time_spend_company + 
##     salary + Work_accident + last_evaluation, data = HR_training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.63834 -0.18607  0.01932  0.19455  0.60152 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.644361   0.013383  48.149  < 2e-16 ***
## number_project     -0.037271   0.001989 -18.741  < 2e-16 ***
## time_spend_company -0.014319   0.001594  -8.985  < 2e-16 ***
## salarylow          -0.039653   0.008516  -4.656 3.26e-06 ***
## salarymedium       -0.017722   0.008597  -2.061   0.0393 *  
## Work_accident       0.043528   0.006414   6.786 1.21e-11 ***
## last_evaluation     0.252203   0.014177  17.790  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2401 on 11242 degrees of freedom
## Multiple R-squared:  0.05756,    Adjusted R-squared:  0.05706 
## F-statistic: 114.4 on 6 and 11242 DF,  p-value: < 2.2e-16

R squared

# R-squared of the training fit (~0.058: weak explanatory power).
summary(lm_model)$r.squared
## [1] 0.05756131

Adjusted R squared

# Adjusted R-squared of the training fit.
summary(lm_model)$adj.r.squared
## [1] 0.05705832

Plot Linear Model

# Standard lm diagnostics: residuals vs fitted, Q-Q, scale-location,
# residuals vs leverage.
par(mfrow = c(2,2))
plot(lm_model)

Predicting from model using Testing Dataset

# Predict satisfaction_level on the held-out test partition.
prediction <- predict(object = lm_model, newdata = HR_test)

Testing accuracy of the data using mean absolute error

# Mean absolute error of the test-set predictions.
Metrics::mae(prediction, HR_test$satisfaction_level)
## [1] 0.205784

Mean Absolute Error is 0.2. This indicates that the average absolute difference between the observed values and the predicted values is approximately 0.2.

Logistic Regression

# ---- Classification of `left`: data preparation ----
LR_HR <- hr

# One-hot encode salary into three 0/1 indicator columns.
# NOTE(review): keeping all three dummies is only workable because the
# model below is fit without an intercept (~ . + 0); with an intercept one
# level would have to be dropped to avoid perfect collinearity.
LR_HR$salary_0 <- ifelse(LR_HR$salary == 'low', 1, 0)
LR_HR$salary_1 <- ifelse(LR_HR$salary == 'medium', 1, 0)
LR_HR$salary_2 <- ifelse(LR_HR$salary == 'high', 1, 0)

# Drop the original salary factor plus two predictors excluded from this
# model, then confirm the remaining columns.
LR_HR <- LR_HR %>%
  select(-salary,-Department,-Work_accident)
LR_HR <- data.frame(LR_HR)
colnames(LR_HR)
##  [1] "satisfaction_level"    "last_evaluation"       "number_project"       
##  [4] "average_montly_hours"  "time_spend_company"    "left"                 
##  [7] "promotion_last_5years" "salary_0"              "salary_1"             
## [10] "salary_2"

Split the data into training and test set

# Reproducible 80/20 train/test split using caret's createDataPartition
# (sampling balanced on satisfaction_level).
set.seed(123)
training.samples <- LR_HR$satisfaction_level %>% 
  createDataPartition(p = 0.8, list = FALSE)
train.data  <- LR_HR[training.samples, ]
test.data <- LR_HR[-training.samples, ]

Training model with all predictive variables

# Fit the model for `left` using all remaining predictors, no intercept.
# Bug fix: glm() defaults to family = gaussian, so the original fitted an
# ordinary linear model even though this section is titled "Logistic
# Regression" and `left` is binary (the coefficient table even reported
# t values, not z values). Specifying the binomial family makes this an
# actual logistic regression, so predict(..., type = "response") below
# returns genuine probabilities in [0, 1].
model <- glm(left ~ . + 0,
             family = binomial,
             data = data.frame(train.data))
summary(model)$coef
##                            Estimate   Std. Error    t value      Pr(>|t|)
## satisfaction_level    -0.6587603939 1.446215e-02 -45.550665  0.000000e+00
## last_evaluation        0.0925642591 2.266551e-02   4.083927  4.456870e-05
## number_project        -0.0353490138 3.296096e-03 -10.724511  1.032118e-26
## average_montly_hours   0.0006283654 7.903627e-05   7.950343  2.026222e-15
## time_spend_company     0.0359188298 2.450352e-03  14.658643  3.098454e-48
## promotion_last_5years -0.1262921114 2.462190e-02  -5.129259  2.954252e-07
## salary_0               0.5150207116 2.092876e-02  24.608277 1.722938e-130
## salary_1               0.4284465899 2.122426e-02  20.186647  3.852667e-89
## salary_2               0.2979233185 2.388676e-02  12.472320  1.754324e-35

Probabilities

# Predicted values on the response scale for the test set (interpreted as
# probabilities of leaving).
prob <- model %>% predict(test.data, type="response")
head(prob)
##         5        10        24        25        39        46 
## 0.4563813 0.4136868 0.3891538 0.4369158 0.6488003 0.1821503

Prediction

# Classify as left (1) when the predicted value exceeds the 0.5 threshold.
pred <- ifelse(prob > 0.5, 1,0)
head(pred)
##  5 10 24 25 39 46 
##  0  0  0  0  1  0

Accuracy

# Overall classification accuracy on the test set.
mean(pred == test.data$left)
## [1] 0.7732578